#-------------------------------------------------------#
#   Quality Assessment of the Academic Freedom Index    #
#-------------------------------------------------------#

# Title: "Quality Assessment of the Academic Freedom Index: Strengths, Weaknesses, and How Best to Use It" 
# Authors: "Lott, Lars", "Spannagel, Janika"
# date: 2024-09-19
# journal: Perspectives on Politics
# DOI: TBA
# written under "R and RStudio (4.4.1)"


#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# load packages
library(here)
library(tidyverse)
library(magrittr)
library(reshape2)
library(ggpubr)
library(countrycode)
library(ggrepel)
library(ggthemes)
library(lubridate)
library(estimatr)
library(texreg)
library(readxl)
library(haven)
library(scales)
library(marginaleffects)

set.seed(1234)

#---------------------------------------------------#
#--------------------load data ---------------------#
#---------------------------------------------------#

#load coder-level data 
coder_level_ds <- read.csv(file.path("data/vdem_v13",  "vdem_13_coder_level", "coder_level_ds_v13.csv"))
gc()

#load coder characteristics
coder_characteristics_wide <- readRDS("data/coder_characteristics_wide_simulated_data.rds")

# load country_aggregated data

vdem_full_v13 <- readRDS(file.path(data_storage, "vdem_13_cy", "V-Dem-CY-Full+Others-v13.rds"))

episodes_data <- read.csv(file.path(data_storage, "episode_without_uncertainty_interval_test.csv"))

gc()
#---------------------------------------------------#
#---------------Coder Disagreement -----------------#
#---------------------------------------------------#

# Coder Disagreement #

list_var <- c("v2cafres", "v2cafexch", "v2cainsaut", "v2casurv", "v2clacfree")

#### Statistical Analysis ####

coder_level_ds <- coder_level_ds %>%
  left_join(coder_characteristics_wide, by = ("coder_id")) %>%
  mutate(reside_in_country = ifelse(country_id == reside, 1, 
                                    ifelse(country_id != reside, 0, NA)))


temp_i <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  group_by(country_id, historical_date) %>%
  summarize(sd_v2cafres = sd(v2cafres, na.rm = TRUE), 
            n_v2cafres = sum(!is.na(v2cafres)), 
            sd_v2cafexch = sd(v2cafexch, na.rm = TRUE), 
            n_v2cafexch =  sum(!is.na(v2cafexch)),
            sd_v2cainsaut = sd(v2cainsaut, na.rm = TRUE), 
            n_v2cainsaut =  sum(!is.na(v2cainsaut)),
            sd_v2casurv = sd(v2casurv, na.rm = TRUE), 
            n_v2casurv =  sum(!is.na(v2casurv)),
            sd_v2clacfree = sd(v2clacfree, na.rm = TRUE), 
            n_v2clacfree =  sum(!is.na(v2clacfree)), 
            v2cafres_conf = mean(v2cafres_conf, na.rm = TRUE), 
            v2cafexch_conf = mean(v2cafexch_conf, na.rm = TRUE), 
            v2cainsaut_conf = mean(v2cainsaut_conf, na.rm = TRUE), 
            v2casurv_conf = mean(v2casurv_conf, na.rm = TRUE), 
            v2clacfree_conf = mean(v2clacfree_conf, na.rm = TRUE))

number_of_coders <- coder_level_ds %>%
  mutate(historical_date = ymd(historical_date)) %>%
  group_by(country_id, historical_date) %>%
  summarize(number_coders = n_distinct(coder_id), 
            number_of_extern_coders = sum(reside_in_country, na.rm = T)/number_coders)

temp_i <- temp_i %>%
  left_join(number_of_coders, by = c("country_id", "historical_date"))

vdem_subset_v13 <- vdem_full_v13 %>%
  mutate(historical_date = ymd(historical_date)) %>%
  dplyr::select(country_id, country_name, year, historical_date, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, v2xca_academ,
                v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, v2x_freexp, e_regionpol) %>%
  left_join(temp_i, by = c("country_id", "historical_date")) %>%
  filter(year >=1900)

vdem_subset_v13$e_regionpol_6C <- as.factor(vdem_subset_v13$e_regionpol_6C)

summary(vdem_subset_v13$number_of_extern_coders)

episodes_data <- episodes_data %>%
  dplyr::select(-c(X, starts_with("v2xca_academ")))

vdem_subset_v13 <- vdem_subset_v13 %>%
  left_join(episodes_data, by = c("country_id", "country_name", "year"))
  
vdem_subset_v13 <- vdem_subset_v13 %>%
  mutate(decline_growth_episode = ifelse(increase_episode == 1, "Growth Episode", 
                                         ifelse(decline_episode == 1, "Decline Episode", "No episode")))

table(vdem_subset_v13$decline_growth_episode)
#Decline Episode  Growth Episode      No episode 
#816              1105                17399 

summary_statistics_coder_disagreement <- vdem_subset_v13 %>%
  group_by(decline_growth_episode) %>%
  summarize(sd_v2cafres = mean(sd_v2cafres, na.rm = T), 
            sd_v2cafexch = mean(sd_v2cafexch, na.rm = T), 
            sd_v2cainsaut = mean(sd_v2cainsaut, na.rm = T), 
            sd_v2casurv = mean(sd_v2casurv, na.rm = T), 
            sd_v2clacfree = mean(sd_v2clacfree, na.rm = T))



vdem_subset_v13_pooled <- vdem_subset_v13 %>%
  pivot_longer(!c(country_id, country_text_id, country_name, year, v2xca_academ, 
                  starts_with("increase_"), starts_with("decline_"), 
                  historical_date, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, 
                  v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop, 
                  e_cow_exports,e_cow_imports, v2x_freexp, e_regionpol, number_coders,
                  number_of_extern_coders, v2cafres_conf, v2cafexch_conf, v2cainsaut_conf, 
                  v2casurv_conf, v2clacfree_conf, n_v2cafres, n_v2cafexch, n_v2cainsaut, 
                  n_v2casurv, n_v2clacfree), names_to = "indicator", values_to = "value_indicator") %>%
  drop_na(value_indicator) %>%
  mutate(v2xca_academ_sqr  = v2xca_academ^2) %>%
  filter(year >=1900) %>%
  mutate(confidence_coders_indidator = case_when(indicator == "sd_v2cafres" ~v2cafres_conf,
                                                 indicator == "sd_v2cafexch" ~v2cafexch_conf,
                                                 indicator == "sd_v2cainsaut" ~v2cainsaut_conf,
                                                 indicator == "sd_v2casurv" ~v2casurv_conf,
                                                 indicator == "sd_v2clacfree" ~v2clacfree_conf))


## plotting ##


globalVariables(c("layer",  "GeomFlatViolin"))

geom_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                             position = "dodge", trim = TRUE, scale = "area",
                             show.legend = NA, inherit.aes = TRUE, ...) {
  layer(
    data = data,
    mapping = mapping,
    stat = stat,
    geom = GeomFlatViolin,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(
      trim = trim,
      scale = scale,
      ...
    )
  )
}


#' ggplot2-ggproto
#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
#' @export
GeomFlatViolin <-
  ggproto("GeomFlatViolin", Geom,
          setup_data = function(data, params) {
            data$width <- data$width %||%
              params$width %||% (resolution(data$x, FALSE) * 0.9)
            
            # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
            data %>%
              dplyr::group_by(group) %>%
              dplyr::mutate(
                ymin = min(y),
                ymax = max(y),
                xmin = x,
                xmax = x + width / 2
              )
          },
          
          draw_group = function(data, panel_scales, coord) {
            # Find the points for the line to go all the way around
            data <- transform(data,
                              xminv = x,
                              xmaxv = x + violinwidth * (xmax - x)
            )
            
            # Make sure it's sorted properly to draw the outline
            newdata <- rbind(
              plyr::arrange(transform(data, x = xminv), y),
              plyr::arrange(transform(data, x = xmaxv), -y)
            )
            
            # Close the polygon: set first and last point the same
            # Needed for coord_polar and such
            newdata <- rbind(newdata, newdata[1, ])
            
            ggplot2:::ggname("geom_flat_violin", GeomPolygon$draw_panel(newdata, panel_scales, coord))
          },
          
          draw_key = draw_key_polygon,
          
          default_aes = aes(
            weight = 1, colour = "grey20", fill = "white", size = 0.5,
            alpha = NA, linetype = "solid"
          ),
          
          required_aes = c("x", "y")
  )

label_facets <- list(
  'sd_v2cafexch'="v2cafexch",
  'sd_v2cafres'="v2cafres",
  'sd_v2cainsaut'="v2cainsaut",
  'sd_v2casurv'="v2casurv", 
  'sd_v2clacfree' = 'v2clacfree'
)

my_comparisons <- list( c("No episode", "Decline Episode"), c("No episode", "Growth Episode"))

ggplot(vdem_subset_v13_pooled, aes(x=decline_growth_episode, y = value_indicator, 
                                   fill = decline_growth_episode, colour = decline_growth_episode))+
  geom_flat_violin(position = position_nudge(x = .25, y = 0),adjust =2, trim = FALSE) +
  geom_boxplot(aes(x =decline_growth_episode, y = value_indicator), outlier.shape = NA, alpha = 0.3, 
               width = .1, colour = "BLACK") +
  stat_summary(geom="text", fun=mean,
               aes(label=sprintf("%1.3f", ..y..)), color = "black",
               position=position_nudge(x= 0.4), size=3.5) +
  ylab('Coder Disagreement')+ xlab('') +
  scale_y_continuous(limits = c(-0.5,3)) +
  #coord_flip() + 
  facet_wrap(~indicator, labeller = as_labeller(label_facets)) + 
  guides(fill = "none", colour = "none") +
  theme_pubr() + 
  theme(axis.text.x = element_text(angle = 45, vjust =, hjust=1)) +
  scale_colour_brewer(palette = "Dark2")+
  scale_fill_brewer(palette = "Dark2")

ggsave("outputs_simulated_data/Figure_J1.pdf", height = 20, width = 25, units = "cm", dpi = 1000)
ggsave("outputs_simulated_data/Figure_J1.png", height = 20, width = 25, units = "cm", dpi = 1000)


################################################################################
######################## Individual Coder level Analysis #######################
################################################################################

## merge country-year data into coder_level ds ##

vdem_subset_v13 <- vdem_subset_v13 %>%
  dplyr::select(country_id, year, historical_date, v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, 
                e_pop, e_cow_exports,e_cow_imports, starts_with("decline"), starts_with("increase"))
  

coder_level_ds_subset <- coder_level_ds %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, starts_with(c("v2cafres", "v2cafexch", "v2cainsaut", 
                                                                                      "v2casurv", "v2clacfree")), gender,
                phd, government_employment, age_block, reside, main_country_id, reside_in_country, 
                v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, v2zzsatisf, v2zzfirstreas) %>%
  mutate(historical_date = ymd(historical_date)) %>%
  left_join(vdem_subset_v13, by = c("country_id", "historical_date")) %>%
  filter(year >= 1900)

coder_level_ds_subset$gender <- as.factor(coder_level_ds_subset$gender)
coder_level_ds_subset$age_block <- as.factor(coder_level_ds_subset$age_block)
coder_level_ds_subset$phd <- as.factor(coder_level_ds_subset$phd)
coder_level_ds_subset$government_employment <- as.factor(coder_level_ds_subset$government_employment)
coder_level_ds_subset$reside <- as.factor(coder_level_ds_subset$reside)
coder_level_ds_subset$e_regionpol_6C <- as.factor(coder_level_ds_subset$e_regionpol_6C)
coder_level_ds_subset$reside_in_country <- as.factor(coder_level_ds_subset$reside_in_country)

#### Example Cases: US, Brazil, and Israel ####

summary(coder_level_ds_subset$year)
exmaple_cases <- c("USA", "BRA", "IND")

dataset_examples <- coder_level_ds_subset %>%
  filter(country_text_id %in% exmaple_cases & year >=2010) %>%
  dplyr::select(country_id, country_text_id, coder_id, historical_date, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree, reside_in_country, main_country_id) %>%
  pivot_longer(!c(country_id, country_text_id, coder_id, historical_date, reside_in_country, main_country_id), names_to = "indicator", values_to = "value_indicator")

dataset_examples <- dataset_examples %>%
  mutate(reside_in_country = ifelse(reside_in_country==1, "Reside in country", "Not reside in country")) %>%
  drop_na(value_indicator) %>%
  mutate(reside_in_country = ifelse(is.na(reside_in_country), "Not reside in country",reside_in_country))
  

ggplot(dataset_examples, aes(x=reside_in_country, y = value_indicator, 
                            fill = reside_in_country, colour = reside_in_country))+
  geom_jitter(aes(shape = indicator), alpha = 0.3) +
  geom_boxplot(aes(x =reside_in_country, y = value_indicator), outlier.shape = 2, alpha = 0.7, 
               width = .2, colour = "BLACK") +
  ylab('Coder Scores')+ xlab('') +
  labs(shape = "") +
  scale_y_continuous(limits = c(0,4)) +
  facet_wrap(~country_text_id, labeller = as_labeller(label_facets)) + 
  guides(fill = "none", colour = "none") +
  theme_pubr() + 
  theme(axis.text.x = element_text(angle = 45, vjust =, hjust=1)) +
  scale_colour_brewer(palette = "Dark2")+
  scale_fill_brewer(palette = "Dark2")

ggsave("outputs_simulated_data/Figure_J2.pdf", height = 15, width = 25, units = "cm", dpi = 1000)
ggsave("outputs_simulated_data/Figure_J2.png", height = 15, width = 25, units = "cm", dpi = 1000)


#summary statistics

summary_experts <- dataset_examples %>%
  dplyr::group_by(reside_in_country, country_text_id) %>%
  dplyr::summarize(n = unique(coder_id)) %>%
  dplyr:: group_by(reside_in_country, country_text_id) %>%
  dplyr::count()

#### Regression Analysis ####

coder_level_ds_subset_pooled <- coder_level_ds_subset %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, year, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree,
                v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, v2zzsatisf, v2zzfirstreas, v2x_polyarchy, v2x_liberal, v2x_libdem,
                e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment,reside_in_country, 
                increase_episode, decline_episode) %>%
  pivot_longer(!c(country_text_id, country_id, coder_id, historical_date, year,v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, 
                  v2zzsatisf, v2zzfirstreas,  v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop,
                  e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment, reside_in_country, 
                  increase_episode, decline_episode), names_to = "indicator", values_to = "coder_value_indicator") %>%
  drop_na(coder_value_indicator) %>%
  mutate(gender = case_when(gender == 0 ~ "Male", gender == 1 ~ "Female"), 
         age_block = case_when(age_block == 0 ~ "First Block", 
                               age_block == 1 ~ "Second Block", 
                               age_block == 2 ~ "Third Block"), 
         phd = case_when(phd == 0 ~ "No PhD", phd == 1 ~ "PhD"), 
         government_employment = case_when(government_employment == 0 ~ "Government employed", government_employment == 1 ~ "No government employed"), 
         reside_in_country_coded = case_when(reside_in_country == 0 ~ "Not reside in country", reside_in_country == 1 ~ "Reside in country"))

summary(coder_level_ds_subset_pooled)

coder_level_ds_subset_pooled <- coder_level_ds_subset_pooled %>%
  mutate(decline_growth_episode = ifelse(increase_episode == 1, "Growth Episode", 
                                         ifelse(decline_episode == 1, "Decline Episode", "No episode")))



## pooled regression analysis ##
m1 <- lm_robust(coder_value_indicator ~ gender + age_block + phd + government_employment + reside_in_country_coded +
                  decline_growth_episode + v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + 
                  v2zzlibdem:v2x_liberal + v2zzelcdem:v2x_polyarchy + e_gdppc + reside_in_country_coded:decline_growth_episode +
                  as.factor(e_regionpol_6C) + as.factor(year) + as.factor(indicator),
                clusters = country_id, 
                data = subset(coder_level_ds_subset_pooled, year >=2010))

summary(m1)
m1$nobs


## Table J1 Supplementary Appendix ##
texreg(list(m1), 
       head.tag = TRUE, body.tag = TRUE,
       digits = 3,
       omit.coef=c('year'), 
       
       custom.coef.names = c("Intercept",
                             "Respondent's Gender", 
                             "Respondent's Age: Oldest Cohort", 
                             "Respondent: PhD education", 
                             "Respondent: Government employed", 
                             "Respondent: Reside or Born in country", 
                             "Growth Episode", 
                             "No Episode", 
                             "Respondent supports freemarket", 
                             "Respondent supports electoral democracy", 
                             "Respondent supports liberal democracy", 
                             "time spent for coding", 
                             "Country liberal component", 
                             "Country electoral democracy level", 
                             "GDP pc", 
                             "Latin America and the Caribbean", 
                             "MENA",
                             "SSA", 
                             "Western Europe and North America",
                             "Asia and Pacific",
                             "Dummy Freedom to research and teach", 
                             "Dummy Institutional Autonomy", 
                             "Dummy Campus Integrity", 
                             "Dummy Freedom of academic and cultural expression", 
                             "Respondent supports liberal democracy * LibDem", 
                             "Respondent supports electoral democracy * EDI", 
                             "Reside in Country * Growth Episode", 
                             "Reside in Country * No Episode"),
       booktabs = TRUE,
       use.packages = FALSE,
       caption = "Linear Models predicting respondents rating with interaction between residing in country and growth and decline episodes",
       fontsize = "scriptsize",
       stars = c(0.001, 0.01, 0.05), 
       label = "Table:J1", 
       longtable = TRUE)


m1_ame <- avg_comparisons(m1,  variables = list(reside_in_country_coded = c("Not reside in country", "Reside in country")), 
                          by = "decline_growth_episode")

f1a <- ggplot(m1_ame, aes(x = decline_growth_episode, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_point() +
  geom_linerange() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Average Marginal Effect",
       x = "Contrast between Reside in country and Not reside in country") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey()


f1b <- plot_predictions(m1, condition = c("reside_in_country_coded", "decline_growth_episode"))+
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Values",
       x = "Local and Non-local Experts") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey()


ggarrange(f1a, f1b, labels = c("A", "B"))

ggsave("outputs_simulated_data/Figure_J3.pdf", width = 20,
       height = 10,
       units = c("cm"), dpi = 1000)
ggsave("outputs_simulated_data/Figure_J3.png", width = 20,
       height = 10,
       units = c("cm"), dpi = 1000)

################################################################################
######## Coder Quality between First- and Non-First Time Coders ################
################################################################################

#### Number of Observations for AFI ####

v2cafexch <-
  list.files(path = "../data/vdem_v14/coder_scores_post_mm/v2cafexch", pattern = "\\.csv$") %>%
  map_df(~read.csv(paste0("../data/vdem_v14/coder_scores_post_mm/v2cafexch/",.))) %>% 
  dplyr::filter(coder_id %in% c("coder_joined")) %>%
  dplyr::select(-c(mean, priors, z, osp))

v2cafres <-
  list.files(path = "../data/vdem_v14/coder_scores_post_mm/v2cafres", pattern = "\\.csv$") %>%
  map_df(~read.csv(paste0("../data/vdem_v14/coder_scores_post_mm/v2cafres/",.))) %>%
  dplyr::filter(coder_id %in% c("coder_joined")) %>%
  dplyr::select(-c(mean, priors, z, osp))

v2cainsaut <-
  list.files(path = "../data/vdem_v14/coder_scores_post_mm/v2cainsaut", pattern = "\\.csv$")%>%
  map_df(~read.csv(paste0("../data/vdem_v14/coder_scores_post_mm/v2cainsaut/",.))) %>%
  dplyr::filter(coder_id %in% c("coder_joined")) %>%
  dplyr::select(-c(mean, priors, z, osp))

v2casurv <-
  list.files(path = "../data/vdem_v14/coder_scores_post_mm/v2casurv", pattern = "\\.csv$")%>%
  map_df(~read.csv(paste0("../data/vdem_v14/coder_scores_post_mm/v2casurv/",.))) %>%
  dplyr::filter(coder_id %in% c("coder_joined")) %>%
  dplyr::select(-c(mean, priors, z, osp))

v2clacfree <-
  list.files(path = "../data/vdem_v14/coder_scores_post_mm/v2clacfree", pattern = "\\.csv$")%>%
  map_df(~read.csv(paste0("../data/vdem_v14/coder_scores_post_mm/v2clacfree/",.))) %>%
  dplyr::filter(coder_id %in% c("coder_joined"))%>%
  dplyr::select(-c(mean, priors, z, osp))

# Pivot longer, so each coder ID is in one row

v2cafexch <- v2cafexch %>%
  pivot_longer(cols = -coder_id, names_to = "coderid", values_to = "coder_joined") %>%
  dplyr::select(-coder_id) %>%
  rename(coder_id = coderid) %>%
  drop_na(coder_joined)
  
v2cafres <- v2cafres %>%
  pivot_longer(cols = -coder_id, names_to = "coderid", values_to = "coder_joined") %>%
  dplyr::select(-coder_id) %>%
  rename(coder_id = coderid) %>%
  drop_na(coder_joined)

v2cainsaut <- v2cainsaut %>%
  pivot_longer(cols = -coder_id, names_to = "coderid", values_to = "coder_joined") %>%
  dplyr::select(-coder_id) %>%
  rename(coder_id = coderid) %>%
  drop_na(coder_joined)

v2casurv <- v2casurv %>%
  pivot_longer(cols = -coder_id, names_to = "coderid", values_to = "coder_joined") %>%
  dplyr::select(-coder_id) %>%
  rename(coder_id = coderid) %>%
  drop_na(coder_joined)

v2clacfree <- v2clacfree %>%
  pivot_longer(cols = -coder_id, names_to = "coderid", values_to = "coder_joined") %>%
  dplyr::select(-coder_id) %>%
  rename(coder_id = coderid) %>%
  drop_na(coder_joined)

coder_database <- rbind(v2cafexch, v2cafres, v2cainsaut, v2casurv, v2clacfree)

coder_database <- coder_database %>%
  dplyr::group_by(coder_id) %>%
  summarize(coder_joined = first(coder_joined))

first_time_coders_13 <- coder_database %>%
  filter(coder_joined == "v13") %>%
  dplyr::select(coder_id)

first_time_coders_13 <- first_time_coders_13$coder_id

#### Descriptive Statistics ####

coder_level_ds <- coder_level_ds %>%
  mutate(new_coder = ifelse(coder_id %in% first_time_coders_13, 1, 0))

time_spent_statistics  <- coder_level_ds %>%
  filter(!is.na(v2cafres) | !is.na(v2cafexch) | !is.na(v2casurv) | !is.na(v2cainsaut) | !is.na(v2clacfree)) %>%
  mutate(year = lubridate::year(historical_date)) %>%
  filter(year >=1900) %>%
  dplyr::select(country_text_id, country_id, historical_date, coder_id,
                v2zztimespent, new_coder, v2cafres, v2cafexch, v2casurv,v2cainsaut,v2clacfree) %>%
  mutate(coder_id = as.character(coder_id)) %>%
  group_by(coder_id) %>%
  summarize(v2zztimespent = first(v2zztimespent), 
            new_coder = first(new_coder))

time_spent_statistics <- time_spent_statistics %>%
  mutate(new_coder = ifelse(new_coder == 1, "First time coder", "Multiple time coder")) %>%
  group_by(new_coder) %>%
  summarize(mean = mean(v2zztimespent, na.rm = T), 
            median = median(v2zztimespent, na.rm = T), 
            sd = sd(v2zztimespent, na.rm = T), 
            max = max(v2zztimespent, na.rm = T), 
            min = min(v2zztimespent, na.rm = T), 
            number_na = sum(is.na(v2zztimespent)))


setwd("P:/PIPM/7. VWS Projekt Academic Freedom Index/Replikationsdateien Paper/replication_files_quality_assessment_of_the_academic_freedom_index")

tinytable::tt(as.data.frame(time_spent_statistics),  digits = 3) %>% save_tt("outputs_simulated_data/Table_J1.tex", overwrite = T)


#### Join new information into the coder_level_ds_subset_pooled ####

vdm_subset_v13_afi_indicators <- vdem_full_v13 %>%
  dplyr::select(country_id, historical_date, v2cafres_osp, v2cafexch_osp, v2casurv_osp, v2cainsaut_osp, v2clacfree_osp)

coder_level_ds_subset <- coder_level_ds_subset %>%
  left_join(vdm_subset_v13_afi_indicators, by = c("country_id", "historical_date")) %>%
  mutate(diff_v2cafres = v2cafres - v2cafres_osp, 
         diff_v2cafexch = v2cafexch - v2cafexch_osp, 
         diff_v2casurv = v2casurv - v2casurv_osp,
         diff_v2cainsaut = v2cainsaut - v2cainsaut_osp,
         diff_v2clacfree = v2clacfree - v2clacfree_osp)

summary(coder_level_ds_subset$diff_v2cafres)
summary(coder_level_ds_subset$diff_v2cafexch)
summary(coder_level_ds_subset$diff_v2casurv)
summary(coder_level_ds_subset$diff_v2cainsaut)
summary(coder_level_ds_subset$diff_v2clacfree)

coder_level_ds_subset_pooled <- coder_level_ds_subset %>%
  dplyr::select(country_text_id, country_id, coder_id, historical_date, year, v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree,
                v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, v2zzsatisf, v2zzfirstreas, v2x_polyarchy, v2x_liberal, v2x_libdem,
                e_regionpol_6C,e_gdppc, e_pop, e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment,reside_in_country, 
                diff_v2cafres, diff_v2cafexch, diff_v2casurv, diff_v2cainsaut, diff_v2clacfree) %>%
  pivot_longer(!c(country_text_id, country_id, coder_id, historical_date, year,v2zzfremrk, v2zzelcdem, v2zzlibdem, v2zztimespent, 
                  v2zzsatisf, v2zzfirstreas,  v2x_polyarchy, v2x_liberal, v2x_libdem, e_regionpol_6C,e_gdppc, e_pop,
                  e_cow_exports,e_cow_imports, gender, age_block, phd, government_employment, reside_in_country, 
                  v2cafres, v2cafexch, v2cainsaut, v2casurv, v2clacfree,), names_to = "indicator", values_to = "coder_value_indicator") %>%
  drop_na(coder_value_indicator) %>%
  mutate(gender = case_when(gender == 0 ~ "Male", gender == 1 ~ "Female"), 
         age_block = case_when(age_block == 0 ~ "First Block", 
                               age_block == 1 ~ "Second Block", 
                               age_block == 2 ~ "Third Block"), 
         phd = case_when(phd == 0 ~ "No PhD", phd == 1 ~ "PhD"), 
         government_employment = case_when(government_employment == 0 ~ "Government employed", government_employment == 1 ~ "No government employed"), 
         reside_in_country_coded = case_when(reside_in_country == 0 ~ "Not reside in country", reside_in_country == 1 ~ "Reside in country"))

coder_level_ds_subset_pooled <- coder_level_ds_subset_pooled %>%
  mutate(new_coder = ifelse(coder_id %in% first_time_coders_13, "New Coder", "Multiple Time Coder"))
coder_level_ds_subset_pooled$new_coder <- as.factor(coder_level_ds_subset_pooled$new_coder)

summary(coder_level_ds_subset_pooled)

## pooled regression analysis ##
m2 <- lm_robust(coder_value_indicator ~ gender + age_block + phd + government_employment + reside_in_country_coded +
                  new_coder + v2zzfremrk + v2zzelcdem + v2zzlibdem + v2zztimespent + v2x_liberal + v2x_polyarchy + 
                  v2zzlibdem:v2x_liberal + v2zzelcdem:v2x_polyarchy + e_gdppc +  +
                  as.factor(e_regionpol_6C) + as.factor(year) + as.factor(indicator),
                clusters = country_id, 
                data = coder_level_ds_subset_pooled)

summary(m2)
m2$nobs


## Table J2 Supplementary Appendix ##
texreg(list(m2), 
       head.tag = TRUE, body.tag = TRUE,
       digits = 3,
       omit.coef=c('year'), 
       file = "outputs_simulated_data/Table_J2.tex",
       custom.coef.names = c("Intercept",
                             "Respondent's Gender", 
                             "Respondent's Age: Oldest Cohort", 
                             "Respondent: PhD education", 
                             "Respondent: Government employed", 
                             "Respondent: Reside in country", 
                             "Respondent: New Coder", 
                             "Respondent supports freemarket", 
                             "Respondent supports electoral democracy", 
                             "Respondent supports liberal democracy", 
                             "time spent for coding", 
                             "Country liberal component", 
                             "Country electoral democracy level",
                             "Respondent supports liberal democracy * LibDem", 
                             "Respondent supports electoral democracy * EDI",
                             "GDP pc", 
                             "Latin America and the Caribbean", 
                             "MENA",
                             "SSA", 
                             "Western Europe and North America",
                             "Asia and Pacific",
                             "Dummy Freedom to research and teach", 
                             "Dummy Institutional Autonomy", 
                             "Dummy Campus Integrity", 
                             "Dummy Freedom of academic and cultural expression"),
       booktabs = TRUE,
       use.packages = FALSE,
       caption = "Linear Models predicting respondents rating with interaction between residing in country and growth and decline episodes",
       fontsize = "scriptsize",
       stars = c(0.001, 0.01, 0.05), 
       label = "Table:J2", 
       longtable = TRUE)


m2_ame <- avg_comparisons(m2,  variables = list(new_coder = c("New Coder", "Multiple Time Coder")))

f2a <- ggplot(m2_ame, aes(x = contrast, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_point() +
  geom_linerange() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Average Marginal Effect",
       x = "Contrast") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey()


f2b <- plot_predictions(m2, condition = c("new_coder"))+
  theme_bw() +
  ggtitle("") +
  labs(color= "", fill = "",
       y= "Predicted Deviation from final data",
       x = "New Coders and Multiple Time Coders") + 
  theme(legend.position="bottom") +
  scale_color_grey() +
  scale_fill_grey()

ggarrange(f2a, f2b, labels = c("A", "B"))

ggsave("outputs_simulated_data/Figure_J4.pdf", width = 20,
       height = 10,
       units = c("cm"), dpi = 1000)
ggsave("outputs_simulated_data/Figure_J4.png", width = 20,
       height = 10,
       units = c("cm"), dpi = 1000)




  
